{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "ba40bc36-f5cf-42a0-b709-a7fbfc1330e4",
   "metadata": {},
   "source": [
    "# Connect to Elasticsearch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "7ec4d6af-ec98-47db-96e9-06b5fd572f3e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'name': 'liuxgm.local', 'cluster_name': 'elasticsearch', 'cluster_uuid': 'h2QwONxsT4Kt-lTRKmPrhg', 'version': {'number': '8.12.0', 'build_flavor': 'default', 'build_type': 'tar', 'build_hash': '1665f706fd9354802c02146c1e6b5c0fbcddfbc9', 'build_date': '2024-01-11T10:05:27.953830042Z', 'build_snapshot': False, 'lucene_version': '9.9.1', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "\n",
    "from dotenv import load_dotenv\n",
    "from elasticsearch import Elasticsearch\n",
    "\n",
    "# Load ES_USER / ES_PASSWORD / ES_ENDPOINT from a local .env file into the process environment.\n",
    "load_dotenv()\n",
    "\n",
    "elastic_user = os.getenv('ES_USER')\n",
    "elastic_password = os.getenv('ES_PASSWORD')\n",
    "elastic_endpoint = os.getenv(\"ES_ENDPOINT\")\n",
    "\n",
    "# Fail early with a readable message instead of connecting to a URL that contains 'None'.\n",
    "missing = [\n",
    "    name\n",
    "    for name, value in (\n",
    "        ('ES_USER', elastic_user),\n",
    "        ('ES_PASSWORD', elastic_password),\n",
    "        ('ES_ENDPOINT', elastic_endpoint),\n",
    "    )\n",
    "    if not value\n",
    "]\n",
    "if missing:\n",
    "    raise RuntimeError(f\"Missing required environment variables: {', '.join(missing)}\")\n",
    "\n",
    "# Credentials are passed through basic_auth rather than embedded in the URL, so a password\n",
    "# containing characters such as '@', ':' or '/' cannot corrupt the connection string.\n",
    "url = f\"https://{elastic_endpoint}:9200\"\n",
    "client = Elasticsearch(\n",
    "    url,\n",
    "    basic_auth=(elastic_user, elastic_password),\n",
    "    ca_certs=\"./http_ca.crt\",  # CA certificate used to verify the cluster's TLS certificate\n",
    "    verify_certs=True,\n",
    ")\n",
    "\n",
    "# Round-trip to the cluster to confirm that the endpoint and credentials work.\n",
    "print(client.info())"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ebbce755-db46-4585-be25-75f1f822aa35",
   "metadata": {},
   "source": [
    "# Load model from Hugging Face"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "ad02f589-3e98-4727-9015-df872ae83b00",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2024-02-08 09:30:35,871 INFO : Establishing connection to Elasticsearch\n",
      "2024-02-08 09:30:35,886 INFO : Connected to cluster named 'elasticsearch' (version: 8.12.0)\n",
      "2024-02-08 09:30:35,886 INFO : Loading HuggingFace transformer tokenizer and model 'sentence-transformers/all-MiniLM-L6-v2'\n",
      "STAGE:2024-02-08 09:30:38 8945:3610134 ActivityProfilerController.cpp:312] Completed Stage: Warm Up\n",
      "STAGE:2024-02-08 09:30:38 8945:3610134 ActivityProfilerController.cpp:318] Completed Stage: Collection\n",
      "STAGE:2024-02-08 09:30:38 8945:3610134 ActivityProfilerController.cpp:322] Completed Stage: Post Processing\n",
      "2024-02-08 09:30:40,055 INFO : Stopping deployment for model with id 'sentence-transformers__all-minilm-l6-v2'\n",
      "2024-02-08 09:30:40,176 INFO : Deleting model with id 'sentence-transformers__all-minilm-l6-v2'\n",
      "2024-02-08 09:30:40,400 INFO : Creating model with id 'sentence-transformers__all-minilm-l6-v2'\n",
      "2024-02-08 09:30:40,571 INFO : Uploading model definition\n",
      "100%|███████████████████████████████████████| 87/87 [00:03<00:00, 25.50 parts/s]\n",
      "2024-02-08 09:30:44,045 INFO : Uploading model vocabulary\n",
      "2024-02-08 09:30:44,082 INFO : Starting model deployment\n",
      "2024-02-08 09:30:45,700 INFO : Model successfully imported with id 'sentence-transformers__all-minilm-l6-v2'\n"
     ]
    }
   ],
   "source": [
    "MODEL_ID = \"sentence-transformers__all-minilm-l6-v2\"\n",
    "\n",
    "# Credentials and endpoint are interpolated from the variables read from the environment in the\n",
    "# connection cell; never paste a real password into a notebook that may be shared or committed.\n",
    "# Single quotes keep the shell from interpreting special characters in the values.\n",
    "# NOTE: --clear-previous stops and deletes any existing model with the same id before uploading.\n",
    "!eland_import_hub_model --url https://{elastic_endpoint}:9200 \\\n",
    "\t--es-username '{elastic_user}' \\\n",
    "\t--es-password '{elastic_password}' \\\n",
    "\t--hub-model-id sentence-transformers/all-MiniLM-L6-v2 \\\n",
    "\t--task-type text_embedding \\\n",
    "\t--ca-cert ./http_ca.crt \\\n",
    "\t--clear-previous \\\n",
    "\t--start"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9ddbf4f0-179b-4f00-b1b2-561cd43b8197",
   "metadata": {},
   "source": [
    "# Chunk and infer in an ingest pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "eecdfede-1b36-405a-8c6f-bcf9cb9171e3",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ObjectApiResponse({'acknowledged': True})"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Setup the pipeline\n",
    "CHUNK_SIZE = 400  # passage size budget in characters, passed to the script as params.model_limit\n",
    "PIPELINE_ID = \"chunk_text_to_passages\"\n",
    "\n",
    "client.ingest.put_pipeline(\n",
    "  id=PIPELINE_ID,\n",
    "  processors=[\n",
    "    {\n",
    "      # Stage 1: split ctx['text'] into sentences and greedily pack consecutive sentences into\n",
    "      # ctx['passages'] while the combined length stays below params.model_limit.\n",
    "      # A single sentence longer than the limit is still emitted as its own passage.\n",
    "      \"script\": {\n",
    "        \"description\": \"Chunk text into passages by splitting on . ! or ? followed by a space\",\n",
    "        \"lang\": \"painless\",\n",
    "        # Raw string: the regex escapes (\\. \\! \\?) must reach Painless verbatim, and inside a\n",
    "        # normal string literal they are invalid Python escape sequences that raise warnings.\n",
    "        # The negative lookbehind keeps 'Mr. ', 'Ms. ' and 'Mrs. ' from ending a sentence.\n",
    "        \"source\": r\"\"\"\n",
    "          String[] envSplit = /((?<!M(r|s|rs)\\.)(?<=\\.) |(?<=\\!) |(?<=\\?) )/.split(ctx['text']);\n",
    "          ctx['passages'] = new ArrayList();\n",
    "          int i = 0;\n",
    "          boolean remaining = true;\n",
    "          if (envSplit.length == 0) {\n",
    "            return\n",
    "          } else if (envSplit.length == 1) {\n",
    "            Map passage = ['text': envSplit[0]];ctx['passages'].add(passage)\n",
    "          } else {\n",
    "            while (remaining) {\n",
    "              Map passage = ['text': envSplit[i++]];\n",
    "              while (i < envSplit.length && passage.text.length() + envSplit[i].length() < params.model_limit) {passage.text = passage.text + ' ' + envSplit[i++]}\n",
    "              if (i == envSplit.length) {remaining = false}\n",
    "              ctx['passages'].add(passage)\n",
    "            }\n",
    "          }\n",
    "          \"\"\",\n",
    "        \"params\": {\n",
    "          \"model_limit\": CHUNK_SIZE\n",
    "        }\n",
    "      }\n",
    "    },\n",
    "    {\n",
    "      # Stage 2: run the embedding model on every passage's text and store the inference result\n",
    "      # on that same passage under `vector`.\n",
    "      \"foreach\": {\n",
    "        \"field\": \"passages\",\n",
    "        \"processor\": {\n",
    "          \"inference\": {\n",
    "            \"field_map\": {\n",
    "              \"_ingest._value.text\": \"text_field\"\n",
    "            },\n",
    "            \"model_id\": MODEL_ID,\n",
    "            \"target_field\": \"_ingest._value.vector\",\n",
    "            # Record inference failures on the document, tagged with this pipeline's id.\n",
    "            \"on_failure\": [\n",
    "              {\n",
    "                \"append\": {\n",
    "                  \"field\": \"_source._ingest.inference_errors\",\n",
    "                  \"value\": [\n",
    "                    {\n",
    "                      \"message\": \"Processor 'inference' in pipeline '\" + PIPELINE_ID + \"' failed with message '{{ _ingest.on_failure_message }}'\",\n",
    "                      \"pipeline\": PIPELINE_ID,\n",
    "                      \"timestamp\": \"{{{ _ingest.timestamp }}}\"\n",
    "                    }\n",
    "                  ]\n",
    "                }\n",
    "              }\n",
    "            ]\n",
    "          }\n",
    "        }\n",
    "      }\n",
    "    }\n",
    "  ]\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cccea01b-b484-44e4-bb4c-f1d614859bfd",
   "metadata": {},
   "source": [
    "# Set up the index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "52e82220-76a4-47b3-94c5-91a0e664b34c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'chunk_passages_example'})"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "INDEX_NAME = \"chunk_passages_example\"\n",
    "\n",
    "# Drop any index left over from a previous run so this cell can be re-executed;\n",
    "# indices.create fails when the index already exists.\n",
    "# WARNING: this deletes the index and all documents in it.\n",
    "client.indices.delete(index=INDEX_NAME, ignore_unavailable=True)\n",
    "\n",
    "# Setup the index\n",
    "client.indices.create(\n",
    "  index=INDEX_NAME,\n",
    "  settings={\n",
    "    \"index\": {\n",
    "      # Every document indexed without an explicit pipeline goes through the chunking pipeline.\n",
    "      \"default_pipeline\": \"chunk_text_to_passages\"\n",
    "    }\n",
    "  },\n",
    "  mappings={\n",
    "    \"dynamic\": \"true\",\n",
    "    \"properties\": {\n",
    "      # Nested mapping so each passage keeps its own vector.\n",
    "      \"passages\": {\n",
    "        \"type\": \"nested\",\n",
    "        \"properties\": {\n",
    "          \"vector\": {\n",
    "            \"properties\": {\n",
    "              \"predicted_value\": {\n",
    "                \"type\": \"dense_vector\",\n",
    "                \"index\": True,\n",
    "                # dims must equal the embedding size produced by the deployed model.\n",
    "                \"dims\": 384,\n",
    "                # NOTE(review): dot_product expects unit-length vectors - presumably the model\n",
    "                # normalizes its output; confirm before swapping in a different model.\n",
    "                \"similarity\": \"dot_product\"\n",
    "              }\n",
    "            }\n",
    "          }\n",
    "        }\n",
    "      }\n",
    "    }\n",
    "  }\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "27ea1c4a-f05a-4bd4-bbb4-cab169ea9eb3",
   "metadata": {},
   "source": [
    "# Add some Documents"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "e01d9595-b1c0-45ff-83be-1b2cfa9df13c",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "from elasticsearch import helpers\n",
    "\n",
    "# Read the file as UTF-8 explicitly so decoding does not depend on the platform's default encoding.\n",
    "with open('workplace-docs.json', encoding='utf-8') as f:\n",
    "    docs = json.load(f)\n",
    "\n",
    "# One bulk action per document; the position in the file is used as the document id.\n",
    "# Only the `content` and `name` fields of each source record are indexed.\n",
    "operations = [\n",
    "    {\n",
    "      \"_index\": INDEX_NAME,\n",
    "      \"_id\": i,\n",
    "      \"text\": doc[\"content\"],\n",
    "      \"name\": doc[\"name\"]\n",
    "    } for i, doc in enumerate(docs)\n",
    "]\n",
    "\n",
    "# Add the documents to the index directly\n",
    "# refresh=True asks Elasticsearch to make the new documents searchable right away.\n",
    "response = helpers.bulk(\n",
    "  client,\n",
    "  operations,\n",
    "  refresh=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "24f765d8-0132-4ac9-b347-2be2c0af83e2",
   "metadata": {},
   "source": [
    "# Aside: Pretty printing Elasticsearch responses"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "025bf58e-1cbc-4185-a69f-a0c5690d4d5f",
   "metadata": {},
   "outputs": [],
   "source": [
    "def pretty_response(response):\n",
    "    \"\"\"Print each search hit's id, document name, matched passage text and score.\n",
    "\n",
    "    Expects a search response whose hits carry `inner_hits` for `passages` with the\n",
    "    `passages.text` field requested; prints a short notice when there are no hits.\n",
    "    \"\"\"\n",
    "    hits = response['hits']['hits']\n",
    "    if not hits:\n",
    "        print('Your search returned no results.')\n",
    "        return\n",
    "\n",
    "    for hit in hits:\n",
    "        doc_id = hit['_id']\n",
    "        score = hit['_score']\n",
    "        doc_title = hit['_source']['name']\n",
    "        inner_passages = hit['inner_hits']['passages']['hits']['hits']\n",
    "        # Every matched passage is followed by a blank line.\n",
    "        passage_text = \"\".join(\n",
    "            inner[\"fields\"][\"passages\"][0]['text'][0] + \"\\n\\n\" for inner in inner_passages\n",
    "        )\n",
    "        print(f\"\\nID: {doc_id}\\nDoc Title: {doc_title}\\nPassage Text:\\n{passage_text}\\nScore: {score}\\n\")\n",
    "        print(\"---\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "197803ff-aaa1-4da2-ac3f-6ffd37860830",
   "metadata": {},
   "source": [
    "# Making queries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "d084c988-3c9e-4187-831c-80145619fe63",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "ID: 0\n",
      "Doc Title: Work From Home Policy\n",
      "Passage Text:\n",
      "Effective: March 2020\n",
      "Purpose\n",
      "\n",
      "The purpose of this full-time work-from-home policy is to provide guidelines and support for employees to conduct their work remotely, ensuring the continuity and productivity of business operations during the COVID-19 pandemic and beyond.\n",
      "Scope\n",
      "\n",
      "This policy applies to all employees who are eligible for remote work as determined by their role and responsibilities.\n",
      "\n",
      "\n",
      "Score: 0.8549611\n",
      "\n",
      "---\n",
      "\n",
      "ID: 7\n",
      "Doc Title: Intellectual Property Policy\n",
      "Passage Text:\n",
      "This policy aims to encourage creativity and innovation while ensuring that the interests of both the company and its employees are protected.\n",
      "\n",
      "Scope\n",
      "This policy applies to all employees, including full-time, part-time, temporary, and contract employees.\n",
      "\n",
      "Definitions\n",
      "a.\n",
      "\n",
      "\n",
      "Score: 0.7664342\n",
      "\n",
      "---\n",
      "\n",
      "ID: 4\n",
      "Doc Title: Company Vacation Policy\n",
      "Passage Text:\n",
      "Purpose\n",
      "\n",
      "The purpose of this vacation policy is to outline the guidelines and procedures for requesting and taking time off from work for personal and leisure purposes.\n",
      "\n",
      "\n",
      "Score: 0.725452\n",
      "\n",
      "---\n"
     ]
    }
   ],
   "source": [
    "# The question is embedded server-side with the deployed model, so no vector is computed locally.\n",
    "query_vector_builder = {\n",
    "    \"text_embedding\": {\n",
    "        \"model_id\": MODEL_ID,\n",
    "        \"model_text\": \"Whats the work from home policy?\",\n",
    "    }\n",
    "}\n",
    "\n",
    "# Return at most one matching passage per document, and only its text field.\n",
    "passage_inner_hits = {\n",
    "    \"size\": 1,\n",
    "    \"_source\": False,\n",
    "    \"fields\": [\"passages.text\"],\n",
    "}\n",
    "\n",
    "# kNN search over the nested passage vectors: top 3 documents out of 100 candidates.\n",
    "response = client.search(\n",
    "    index=INDEX_NAME,\n",
    "    knn={\n",
    "        \"field\": \"passages.vector.predicted_value\",\n",
    "        \"k\": 3,\n",
    "        \"num_candidates\": 100,\n",
    "        \"query_vector_builder\": query_vector_builder,\n",
    "        \"inner_hits\": passage_inner_hits,\n",
    "    },\n",
    ")\n",
    "\n",
    "pretty_response(response)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "588441ff-e430-4b4c-9af9-50e6aab2b5c6",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python_3.11",
   "language": "python",
   "name": "python_3.11"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
